"""Derive street-scale tree density metrics from Forest Research TOW + NFI data. The Forest Research Trees Outside Woodland release is an Esri File Geodatabase inside property-data/FR_TOW_V1_ALL.zip. This transformer computes a compact postcode-level metric from the tree polygons, then optionally rolls that up to Price Paid street names so the dashboard can answer "what is this address's street like?" without loading the full geodatabase at runtime. TOW only covers trees *outside* woodland, so the National Forest Inventory (NFI) woodland layer is optionally unioned in. TOW canopy is accumulated by centroid proximity (tiny crowns), while large NFI woodland parcels are accumulated by true buffer-clipped intersection area so they cannot saturate a postcode from mere centroid proximity. """ from __future__ import annotations import argparse import math import shutil import zipfile from pathlib import Path import numpy as np import polars as pl import pyogrio import shapely from scipy.spatial import cKDTree TOW_GDB_NAME = "FR_TOW_V1_ALL.gdb" STREET_TREE_DENSITY_COL = "Street tree density percentile" STREET_TREE_COVERAGE_COL = "Street tree coverage (%)" POSTCODE_DENSITY_COL = "Tree canopy density within {radius}m (%)" POSTCODE_DENSITY_PERCENTILE_COL = "Tree canopy density percentile within {radius}m" POSTCODE_AREA_COL = "Tree canopy area within {radius}m (sqm)" POSTCODE_COUNT_COL = "Tree features within {radius}m" POSTCODE_HEIGHT_COL = "Mean TOW height within {radius}m (m)" # National Forest Inventory (NFI) woodland — the geometric complement of TOW. # NFI ships as a zipped shapefile of woodland parcels (>=0.5 ha) in EPSG:27700. # Field names are from the NFI Woodland England 2022 release; re-check on bumps. NFI_CATEGORY_COL = "CATEGORY" NFI_WOODLAND_VALUE = "Woodland" NFI_TYPE_COL = "IFT_IOA" NFI_AREA_HA_COL = "Area_ha" def _safe_extract_zip(zip_path: Path, extract_dir: Path, force: bool) -> Path: """Extract the TOW zip and return the extracted .gdb path.""" gdb_path = extract_dir / TOW_GDB_NAME if gdb_path.exists() and not force: print(f"Using existing extracted geodatabase: {gdb_path}") return gdb_path if force and extract_dir.exists(): shutil.rmtree(extract_dir) elif extract_dir.exists(): print(f"Removing incomplete extraction directory: {extract_dir}") shutil.rmtree(extract_dir) tmp_dir = extract_dir.with_name(f".{extract_dir.name}.tmp") if tmp_dir.exists(): shutil.rmtree(tmp_dir) tmp_dir.mkdir(parents=True) root = tmp_dir.resolve() print(f"Extracting {zip_path} to {extract_dir}...") with zipfile.ZipFile(zip_path) as archive: for member in archive.infolist(): target = (tmp_dir / member.filename).resolve() if root != target and root not in target.parents: raise ValueError(f"Unsafe path in zip archive: {member.filename}") if member.is_dir(): target.mkdir(parents=True, exist_ok=True) continue target.parent.mkdir(parents=True, exist_ok=True) with archive.open(member) as source, target.open("wb") as dest: shutil.copyfileobj(source, dest, length=1024 * 1024) if not (tmp_dir / TOW_GDB_NAME).exists(): raise FileNotFoundError(f"{TOW_GDB_NAME} was not found inside {zip_path}") tmp_dir.rename(extract_dir) print(f"Extracted geodatabase: {gdb_path}") return gdb_path def _tow_dataset_path( zip_path: Path, extract_dir: Path, force_extract: bool, use_vsizip: bool ) -> str: if use_vsizip: return f"/vsizip/{zip_path.resolve()}/{TOW_GDB_NAME}" return str(_safe_extract_zip(zip_path, extract_dir, force_extract)) def _safe_extract_zip_dir(zip_path: Path, extract_dir: Path, force: bool) -> Path: """Extract an arbitrary zip into extract_dir and return the directory.""" if extract_dir.exists() and not force: print(f"Using existing extraction directory: {extract_dir}") return extract_dir if extract_dir.exists(): shutil.rmtree(extract_dir) tmp_dir = extract_dir.with_name(f".{extract_dir.name}.tmp") if tmp_dir.exists(): shutil.rmtree(tmp_dir) tmp_dir.mkdir(parents=True) root = tmp_dir.resolve() print(f"Extracting {zip_path} to {extract_dir}...") with zipfile.ZipFile(zip_path) as archive: for member in archive.infolist(): target = (tmp_dir / member.filename).resolve() if root != target and root not in target.parents: raise ValueError(f"Unsafe path in zip archive: {member.filename}") if member.is_dir(): target.mkdir(parents=True, exist_ok=True) continue target.parent.mkdir(parents=True, exist_ok=True) with archive.open(member) as source, target.open("wb") as dest: shutil.copyfileobj(source, dest, length=1024 * 1024) tmp_dir.rename(extract_dir) print(f"Extracted archive: {extract_dir}") return extract_dir def _nfi_dataset_path( zip_path: Path, extract_dir: Path, force_extract: bool, use_vsizip: bool ) -> str: """Resolve the NFI woodland shapefile path, extracting the zip if needed.""" if use_vsizip: return f"/vsizip/{zip_path.resolve()}" extracted = _safe_extract_zip_dir(zip_path, extract_dir, force_extract) shapefiles = sorted(extracted.rglob("*.shp")) if not shapefiles: raise FileNotFoundError(f"No .shp found inside {zip_path}") return str(shapefiles[0]) def _geometry_column(metadata: dict, column_names: list[str]) -> str: """Resolve the geometry column name from pyogrio Arrow metadata.""" geometry_name = metadata.get("geometry_name") if geometry_name: return str(geometry_name) for name in ("wkb_geometry", "geometry", "geom"): if name in column_names: return name return column_names[-1] def _postcode_points(arcgis_path: Path, max_postcodes: int | None) -> pl.DataFrame: points = ( pl.scan_parquet(arcgis_path) .filter(pl.col("ctry25cd") == "E92000001") .filter(pl.col("doterm").is_null()) .select( pl.col("pcds").alias("postcode"), pl.col("east1m").cast(pl.Float64).alias("x"), pl.col("north1m").cast(pl.Float64).alias("y"), ) .drop_nulls(["postcode", "x", "y"]) .unique("postcode") .sort("postcode") ) if max_postcodes is not None: points = points.head(max_postcodes) df = points.collect() print(f"Loaded {df.height:,} active English postcode points") return df def _layers(dataset_path: str, selected_layers: tuple[str, ...] | None) -> list[str]: available = [layer for layer, _geometry_type in pyogrio.list_layers(dataset_path)] if selected_layers is None: return available missing = sorted(set(selected_layers) - set(available)) if missing: raise ValueError(f"Unknown TOW layer(s): {', '.join(missing)}") return [layer for layer in available if layer in selected_layers] def _metric_columns(radius_m: int) -> tuple[str, str, str, str]: return ( POSTCODE_DENSITY_COL.format(radius=radius_m), POSTCODE_AREA_COL.format(radius=radius_m), POSTCODE_COUNT_COL.format(radius=radius_m), POSTCODE_HEIGHT_COL.format(radius=radius_m), ) def _postcode_density_percentile_col(radius_m: int) -> str: return POSTCODE_DENSITY_PERCENTILE_COL.format(radius=radius_m) def _coverage_percentile_expr(column: str, alias: str) -> pl.Expr: """Rank higher tree coverage higher on a 0-100 England-wide percentile scale.""" value = pl.col(column).fill_nan(None) non_null_count = value.count() rank = value.rank("average") return ( pl.when(value.is_null()) .then(None) .when(value == value.min()) .then(0.0) .when(value == value.max()) .then(100.0) .when(non_null_count > 1) .then(((rank - 1) / (non_null_count - 1) * 100).round(1)) .otherwise(100.0) .cast(pl.Float32) .alias(alias) ) def _with_postcode_density_percentiles( postcode_metrics: pl.DataFrame, radius_m: int ) -> pl.DataFrame: density_col, _area_col, _count_col, _height_col = _metric_columns(radius_m) return postcode_metrics.with_columns( _coverage_percentile_expr( density_col, _postcode_density_percentile_col(radius_m), ) ) def _accumulate_tree_metrics( dataset_path: str, points: pl.DataFrame, radius_m: int, batch_size: int, layer_names: tuple[str, ...] | None, max_features_per_layer: int | None, workers: int, canopy_area: np.ndarray, feature_count: np.ndarray, height_weighted_sum: np.ndarray, height_weight: np.ndarray, ) -> None: xy = points.select("x", "y").to_numpy() tree = cKDTree(xy) layers = _layers(dataset_path, layer_names) print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}") columns = ["Woodland_Type", "TOW_Area_M", "MEANHT"] total_features_seen = 0 total_features_used = 0 for layer in layers: info = pyogrio.read_info(dataset_path, layer=layer) print(f"\nLayer {layer}: {info.get('features', 0):,} features") layer_features_seen = 0 with pyogrio.open_arrow( dataset_path, layer=layer, columns=columns, batch_size=batch_size, use_pyarrow=True, ) as (_meta, reader): for batch_index, batch in enumerate(reader, start=1): if max_features_per_layer is not None: remaining = max_features_per_layer - layer_features_seen if remaining <= 0: break if batch.num_rows > remaining: batch = batch.slice(0, remaining) layer_features_seen += batch.num_rows total_features_seen += batch.num_rows names = batch.schema.names area = np.asarray( batch.column(names.index("TOW_Area_M")).to_numpy(zero_copy_only=False), dtype=np.float64, ) height = np.asarray( batch.column(names.index("MEANHT")).to_numpy(zero_copy_only=False), dtype=np.float64, ) geometry = np.asarray( batch.column(names.index("SHAPE")).to_numpy(zero_copy_only=False), dtype=object, ) valid = np.isfinite(area) & (area > 0) if not valid.any(): continue geometry = geometry[valid] area = area[valid] height = height[valid] centroids = shapely.centroid(shapely.from_wkb(geometry)) x = shapely.get_x(centroids) y = shapely.get_y(centroids) valid_xy = np.isfinite(x) & np.isfinite(y) if not valid_xy.any(): continue x = x[valid_xy] y = y[valid_xy] area = area[valid_xy] height = height[valid_xy] nearby = tree.query_ball_point( np.column_stack((x, y)), radius_m, workers=workers ) lengths = np.fromiter( (len(postcode_indexes) for postcode_indexes in nearby), dtype=np.int32, count=len(nearby), ) matching_features = lengths > 0 if matching_features.any(): postcode_indexes = np.concatenate( [indexes for indexes in nearby if indexes] ).astype(np.int64, copy=False) feature_indexes = np.repeat( np.flatnonzero(matching_features), lengths[matching_features] ) np.add.at(canopy_area, postcode_indexes, area[feature_indexes]) np.add.at(feature_count, postcode_indexes, 1) feature_height = height[feature_indexes] valid_height = np.isfinite(feature_height) if valid_height.any(): height_area = area[feature_indexes][valid_height] np.add.at( height_weighted_sum, postcode_indexes[valid_height], feature_height[valid_height] * height_area, ) np.add.at( height_weight, postcode_indexes[valid_height], height_area, ) total_features_used += len(area) if batch_index == 1 or batch_index % 25 == 0: print( f" batch {batch_index:,}: " f"{total_features_seen:,} rows read, " f"{total_features_used:,} features with usable centroids" ) def _postcode_buffers( points: pl.DataFrame, radius_m: int ) -> tuple[np.ndarray, shapely.STRtree]: """Build a radius-r circle for every postcode plus an STRtree over them. Circle index == postcode index, matching the order used by the cKDTree path. """ xy = points.select("x", "y").to_numpy() circles = shapely.buffer(shapely.points(xy), radius_m, quad_segs=8) return circles, shapely.STRtree(circles) def _add_nfi_batch( geoms: np.ndarray, category: np.ndarray, circles: np.ndarray, tree: shapely.STRtree, canopy_area: np.ndarray, feature_count: np.ndarray, radius_m: int, ) -> None: """Add NFI woodland into the shared arrays by true buffer-clipped area. Unlike the TOW centroid path, this clips each woodland polygon to each nearby postcode circle and adds only area(polygon ∩ circle); a large parcel therefore cannot saturate a postcode from mere centroid proximity, and a buffer-filling parcel whose centroid is outside the radius is not missed. """ keep = (category == NFI_WOODLAND_VALUE) & ~shapely.is_missing(geoms) geoms = geoms[keep] if geoms.size: geoms = geoms[~shapely.is_empty(geoms)] if geoms.size == 0: return # dwithin(polygon, point, r) is true iff the radius-r circle around the # point intersects the polygon -- exactly the candidate set we want. nfi_index, postcode_index = tree.query( geoms, predicate="dwithin", distance=radius_m ) if nfi_index.size == 0: return clipped_area = shapely.area( shapely.intersection(geoms[nfi_index], circles[postcode_index]) ) positive = clipped_area > 0 postcode_index = postcode_index[positive] clipped_area = clipped_area[positive] np.add.at(canopy_area, postcode_index, clipped_area) np.add.at(feature_count, postcode_index, 1) def _accumulate_nfi_metrics( dataset_path: str, circles: np.ndarray, tree: shapely.STRtree, canopy_area: np.ndarray, feature_count: np.ndarray, radius_m: int, batch_size: int, max_nfi_features: int | None, ) -> None: layers = _layers(dataset_path, None) print(f"Processing {len(layers)} NFI layer(s): {', '.join(layers)}") # Density only needs the woodland flag + geometry; area is clipped from the # postcode buffer, not read from the file. columns = [NFI_CATEGORY_COL] features_seen = 0 for layer in layers: with pyogrio.open_arrow( dataset_path, layer=layer, columns=columns, batch_size=batch_size, use_pyarrow=True, ) as (meta, reader): for batch_index, batch in enumerate(reader, start=1): if max_nfi_features is not None: remaining = max_nfi_features - features_seen if remaining <= 0: break if batch.num_rows > remaining: batch = batch.slice(0, remaining) features_seen += batch.num_rows names = batch.schema.names geometry_column = _geometry_column(meta, names) category = np.asarray( batch.column(names.index(NFI_CATEGORY_COL)).to_numpy( zero_copy_only=False ), dtype=object, ) geometry = np.asarray( batch.column(names.index(geometry_column)).to_numpy( zero_copy_only=False ), dtype=object, ) _add_nfi_batch( shapely.from_wkb(geometry), category, circles, tree, canopy_area, feature_count, radius_m, ) if batch_index == 1 or batch_index % 25 == 0: print(f" NFI batch {batch_index:,}: {features_seen:,} rows read") def _finalize_metrics( points: pl.DataFrame, canopy_area: np.ndarray, feature_count: np.ndarray, height_weighted_sum: np.ndarray, height_weight: np.ndarray, radius_m: int, ) -> pl.DataFrame: n_points = points.height density_col, area_col, count_col, height_col = _metric_columns(radius_m) buffer_area = math.pi * radius_m * radius_m density_pct = np.minimum(canopy_area / buffer_area * 100.0, 100.0) mean_height = np.divide( height_weighted_sum, height_weight, out=np.full(n_points, np.nan, dtype=np.float64), where=height_weight > 0, ) return pl.DataFrame( { "postcode": points["postcode"], area_col: canopy_area.round(1).astype(np.float32), density_col: density_pct.round(1).astype(np.float32), count_col: feature_count.astype(np.uint32), height_col: np.round(mean_height, 1).astype(np.float32), } ).with_columns( pl.col(height_col).fill_nan(None), ) def _clean_key_expr(column: str) -> pl.Expr: return ( pl.col(column) .fill_null("") .str.to_uppercase() .str.replace_all(r"[^A-Z0-9]+", " ") .str.replace_all(r"\s+", " ") .str.strip_chars() ) def _latest_price_paid_addresses(price_paid_path: Path) -> pl.LazyFrame: return ( pl.scan_parquet(price_paid_path) .select( pl.col("postcode").str.strip_chars().str.to_uppercase().alias("postcode"), "paon", "saon", "street", "locality", "town_city", "district", "county", "date_of_transfer", ) .filter(pl.col("postcode").is_not_null()) .filter(pl.col("street").is_not_null()) .filter(_clean_key_expr("street") != "") .with_columns( pl.concat_str( [pl.col("saon"), pl.col("paon"), pl.col("street")], separator=" ", ignore_nulls=True, ) .str.replace_all(r"\s+", " ") .str.strip_chars() .alias("pp_address"), ) .filter(pl.col("pp_address").is_not_null()) .sort("date_of_transfer") .group_by("postcode", "pp_address", maintain_order=True) .agg( pl.col("street").last(), pl.col("locality").last(), pl.col("town_city").last(), pl.col("district").last(), pl.col("county").last(), ) .with_columns( pl.concat_str( [ _clean_key_expr("street"), _clean_key_expr("town_city"), _clean_key_expr("district"), _clean_key_expr("county"), ], separator="|", ).alias("street_key") ) ) def _weighted_mean_expr(column: str, weight: str) -> pl.Expr: valid = pl.col(column).is_not_null() & ~pl.col(column).is_nan() numerator = pl.when(valid).then(pl.col(column) * pl.col(weight)).sum() denominator = pl.when(valid).then(pl.col(weight)).sum() return pl.when(denominator > 0).then(numerator / denominator).otherwise(None) def _write_street_rollups( postcode_metrics: pl.DataFrame, price_paid_path: Path, output_streets: Path | None, output_addresses: Path | None, radius_m: int, ) -> None: if output_streets is None and output_addresses is None: return density_col, area_col, count_col, height_col = _metric_columns(radius_m) metrics = postcode_metrics.lazy() addresses = _latest_price_paid_addresses(price_paid_path).join( metrics, on="postcode", how="inner" ) per_postcode = ( addresses.group_by( "street_key", "postcode", "street", "locality", "town_city", "district", "county", ) .agg( pl.len().alias("address_count"), pl.col(density_col).first(), pl.col(area_col).first(), pl.col(count_col).first(), pl.col(height_col).first(), ) .collect() ) streets = ( per_postcode.lazy() .group_by("street_key") .agg( pl.col("street").first(), pl.col("locality").first(), pl.col("town_city").first(), pl.col("district").first(), pl.col("county").first(), pl.col("postcode").n_unique().alias("postcode_count"), pl.col("address_count").sum().alias("address_count"), _weighted_mean_expr(density_col, "address_count") .round(1) .cast(pl.Float32) .alias(STREET_TREE_COVERAGE_COL), _weighted_mean_expr(area_col, "address_count") .round(1) .cast(pl.Float32) .alias(f"Street average {area_col}"), _weighted_mean_expr(count_col, "address_count") .round(1) .cast(pl.Float32) .alias(f"Street average {count_col}"), _weighted_mean_expr(height_col, "address_count") .round(1) .cast(pl.Float32) .alias(f"Street average {height_col}"), ) .with_columns( _coverage_percentile_expr( STREET_TREE_COVERAGE_COL, STREET_TREE_DENSITY_COL, ) ) .sort("street_key") .collect() ) if output_addresses is not None: output_addresses.parent.mkdir(parents=True, exist_ok=True) address_output = addresses.join( streets.lazy().select( "street_key", STREET_TREE_COVERAGE_COL, STREET_TREE_DENSITY_COL, ), on="street_key", how="left", ) address_output.sink_parquet(output_addresses, compression="zstd") print(f"Wrote address tree-density join: {output_addresses}") if output_streets is not None: output_streets.parent.mkdir(parents=True, exist_ok=True) streets.write_parquet(output_streets, compression="zstd") print(f"Wrote street tree-density rollup: {output_streets}") def _parse_csv_arg(value: str | None) -> tuple[str, ...] | None: if value is None: return None if value.lower() == "all": return None parts = tuple(part.strip() for part in value.split(",") if part.strip()) return parts or None def main() -> None: parser = argparse.ArgumentParser( description="Build postcode and street tree-density metrics from FR_TOW_V1_ALL.zip" ) parser.add_argument( "--tow-zip", type=Path, default=Path("property-data/FR_TOW_V1_ALL.zip"), help="Forest Research TOW zip containing FR_TOW_V1_ALL.gdb", ) parser.add_argument( "--extract-dir", type=Path, default=Path("property-data/fr_tow_v1_all"), help="Directory where the zip is extracted", ) parser.add_argument( "--force-extract", action="store_true", help="Re-extract the TOW zip even if the geodatabase already exists", ) parser.add_argument( "--use-vsizip", action="store_true", help="Read the geodatabase directly from the zip instead of extracting it", ) parser.add_argument( "--nfi-zip", type=Path, default=Path("property-data/NFI_WOODLAND_ENGLAND.zip"), help="Optional NFI woodland shapefile zip to union with TOW (skipped if absent)", ) parser.add_argument( "--nfi-extract-dir", type=Path, default=Path("property-data/nfi_woodland_england"), help="Directory where the NFI zip is extracted", ) parser.add_argument( "--arcgis", type=Path, default=Path("property-data/arcgis_data.parquet"), help="Postcode centroid parquet with east1m/north1m columns", ) parser.add_argument( "--price-paid", type=Path, default=None, help="Optional Price Paid parquet used to roll postcode metrics up to streets", ) parser.add_argument( "--output-postcodes", type=Path, required=True, help="Output postcode-level tree-density parquet", ) parser.add_argument( "--output-streets", type=Path, default=None, help="Optional output street-level tree-density parquet", ) parser.add_argument( "--output-addresses", type=Path, default=None, help="Optional output address/street join parquet keyed by postcode and pp_address", ) parser.add_argument( "--radius-m", type=int, default=50, help="Radius around each postcode centroid used as the street-scale buffer", ) parser.add_argument( "--layers", default=None, help="Optional comma-separated subset of TOW layers for testing", ) parser.add_argument( "--batch-size", type=int, default=65_536, help="Arrow batch size for reading TOW features", ) parser.add_argument( "--workers", type=int, default=-1, help="Worker count passed to scipy cKDTree.query_ball_point", ) parser.add_argument( "--max-postcodes", type=int, default=None, help="Testing only: process the first N postcode points", ) parser.add_argument( "--max-features-per-layer", type=int, default=None, help="Testing only: process at most N TOW features per layer", ) parser.add_argument( "--max-nfi-features", type=int, default=None, help="Testing only: process at most N NFI woodland features", ) args = parser.parse_args() if (args.output_streets or args.output_addresses) and args.price_paid is None: raise SystemExit("--price-paid is required when writing street/address outputs") if args.radius_m <= 0: raise SystemExit("--radius-m must be greater than zero") dataset_path = _tow_dataset_path( args.tow_zip, args.extract_dir, args.force_extract, args.use_vsizip ) points = _postcode_points(args.arcgis, args.max_postcodes) layer_names = _parse_csv_arg(args.layers) n_points = points.height canopy_area = np.zeros(n_points, dtype=np.float64) feature_count = np.zeros(n_points, dtype=np.uint32) height_weighted_sum = np.zeros(n_points, dtype=np.float64) height_weight = np.zeros(n_points, dtype=np.float64) _accumulate_tree_metrics( dataset_path=dataset_path, points=points, radius_m=args.radius_m, batch_size=args.batch_size, layer_names=layer_names, max_features_per_layer=args.max_features_per_layer, workers=args.workers, canopy_area=canopy_area, feature_count=feature_count, height_weighted_sum=height_weighted_sum, height_weight=height_weight, ) if args.nfi_zip is not None and args.nfi_zip.exists(): nfi_path = _nfi_dataset_path( args.nfi_zip, args.nfi_extract_dir, args.force_extract, args.use_vsizip ) circles, nfi_tree = _postcode_buffers(points, args.radius_m) _accumulate_nfi_metrics( dataset_path=nfi_path, circles=circles, tree=nfi_tree, canopy_area=canopy_area, feature_count=feature_count, radius_m=args.radius_m, batch_size=args.batch_size, max_nfi_features=args.max_nfi_features, ) elif args.nfi_zip is not None: print(f"NFI zip not found, skipping woodland union: {args.nfi_zip}") postcode_metrics = _finalize_metrics( points, canopy_area, feature_count, height_weighted_sum, height_weight, args.radius_m, ) postcode_metrics = _with_postcode_density_percentiles( postcode_metrics, args.radius_m ) args.output_postcodes.parent.mkdir(parents=True, exist_ok=True) postcode_metrics.write_parquet(args.output_postcodes, compression="zstd") print(f"\nWrote postcode tree-density metrics: {args.output_postcodes}") if args.price_paid is not None: _write_street_rollups( postcode_metrics=postcode_metrics, price_paid_path=args.price_paid, output_streets=args.output_streets, output_addresses=args.output_addresses, radius_m=args.radius_m, ) if __name__ == "__main__": main()