improve

2026-05-31 20:20:41 +01:00 · 2026-05-31 20:20:41 +01:00 · e8345cbdc1
commit e8345cbdc1
parent 8688b7475e
40 changed files with 1980 additions and 904 deletions
--- a/pipeline/transform/tree_density.py
+++ b/pipeline/transform/tree_density.py
@@ -1,16 +1,28 @@
-"""Derive street-scale tree density metrics from Forest Research TOW + NFI data.
+"""Derive postcode-scale tree density metrics from Forest Research TOW + NFI data.

 The Forest Research Trees Outside Woodland release is an Esri File Geodatabase
 inside property-data/FR_TOW_V1_ALL.zip. This transformer computes a compact
-postcode-level metric from the tree polygons, then optionally rolls that up to
-Price Paid street names so the dashboard can answer "what is this address's
-street like?" without loading the full geodatabase at runtime.
+postcode-level metric from the tree polygons so the dashboard can answer "how
+green is this postcode?" without loading the full geodatabase at runtime.

-TOW only covers trees *outside* woodland, so the National Forest Inventory (NFI)
-woodland layer is optionally unioned in. TOW canopy is accumulated by centroid
-proximity (tiny crowns), while large NFI woodland parcels are accumulated by
-true buffer-clipped intersection area so they cannot saturate a postcode from
-mere centroid proximity.
+Every postcode centroid is expanded into a radius-r buffer ("extended area").
+Both TOW tree crowns and National Forest Inventory (NFI) woodland parcels are
+accumulated by *true buffer-clipped intersection area*: only the part of each
+polygon that falls inside a postcode's buffer is counted, never the area that
+spills outside it. A crown straddling the buffer edge therefore contributes only
+its inside portion, and a parcel reaching into the buffer from outside is still
+counted -- no polygon can saturate a postcode from mere proximity.
+
+TOW only covers trees *outside* woodland, so the NFI woodland layer is the
+geometric complement of TOW and is optionally unioned in. The two products are
+*assumed disjoint*: clipped TOW crown area and clipped NFI woodland area are
+summed into the same per-postcode accumulator, so any spatial overlap between a
+TOW crown and an NFI parcel (boundary slop where "groups of trees" meet
+"woodland") would be double-counted. The final density is capped at 100% and
+_finalize_metrics logs how many postcodes exceed 100% raw coverage, which is a
+direct symptom of such overlap (or of overlapping crowns within one buffer); if
+that count is material the products are not disjoint and the NFI clip should be
+taken against the complement of TOW.
 """

 from __future__ import annotations
@@ -25,16 +37,12 @@ import numpy as np
 import polars as pl
 import pyogrio
 import shapely
-from scipy.spatial import cKDTree


 TOW_GDB_NAME = "FR_TOW_V1_ALL.gdb"
-STREET_TREE_DENSITY_COL = "Street tree density percentile"
-STREET_TREE_COVERAGE_COL = "Street tree coverage (%)"
 POSTCODE_DENSITY_COL = "Tree canopy density within {radius}m (%)"
 POSTCODE_DENSITY_PERCENTILE_COL = "Tree canopy density percentile within {radius}m"
 POSTCODE_AREA_COL = "Tree canopy area within {radius}m (sqm)"
-POSTCODE_COUNT_COL = "Tree features within {radius}m"
 POSTCODE_HEIGHT_COL = "Mean TOW height within {radius}m (m)"

 # National Forest Inventory (NFI) woodland — the geometric complement of TOW.
@@ -131,13 +139,24 @@ def _safe_extract_zip_dir(zip_path: Path, extract_dir: Path, force: bool) -> Pat
 def _nfi_dataset_path(
    zip_path: Path, extract_dir: Path, force_extract: bool, use_vsizip: bool
 ) -> str:
-    """Resolve the NFI woodland shapefile path, extracting the zip if needed."""
+    """Resolve the NFI woodland shapefile path, extracting the zip if needed.
+
+    Raises if the archive contains zero or more than one shapefile rather than
+    silently picking one, so an ambiguous NFI release fails loudly instead of
+    accumulating canopy from the wrong layer.
+    """
    if use_vsizip:
        return f"/vsizip/{zip_path.resolve()}"
    extracted = _safe_extract_zip_dir(zip_path, extract_dir, force_extract)
    shapefiles = sorted(extracted.rglob("*.shp"))
    if not shapefiles:
        raise FileNotFoundError(f"No .shp found inside {zip_path}")
+    if len(shapefiles) > 1:
+        names = ", ".join(path.name for path in shapefiles)
+        raise ValueError(
+            f"Expected exactly one shapefile inside {zip_path}, found {len(shapefiles)} "
+            f"({names}); cannot unambiguously pick the NFI woodland layer"
+        )
    return str(shapefiles[0])


@@ -146,7 +165,7 @@ def _geometry_column(metadata: dict, column_names: list[str]) -> str:
    geometry_name = metadata.get("geometry_name")
    if geometry_name:
        return str(geometry_name)
-    for name in ("wkb_geometry", "geometry", "geom"):
+    for name in ("wkb_geometry", "geometry", "geom", "SHAPE"):
        if name in column_names:
            return name
    return column_names[-1]
@@ -184,11 +203,10 @@ def _layers(dataset_path: str, selected_layers: tuple[str, ...] | None) -> list[
    return [layer for layer in available if layer in selected_layers]


-def _metric_columns(radius_m: int) -> tuple[str, str, str, str]:
+def _metric_columns(radius_m: int) -> tuple[str, str, str]:
    return (
        POSTCODE_DENSITY_COL.format(radius=radius_m),
        POSTCODE_AREA_COL.format(radius=radius_m),
-        POSTCODE_COUNT_COL.format(radius=radius_m),
        POSTCODE_HEIGHT_COL.format(radius=radius_m),
    )

@@ -198,20 +216,23 @@ def _postcode_density_percentile_col(radius_m: int) -> str:


 def _coverage_percentile_expr(column: str, alias: str) -> pl.Expr:
-    """Rank higher tree coverage higher on a 0-100 England-wide percentile scale."""
+    """Rank tree coverage on a 0-100 England-wide percentile scale.
+
+    A single tie-consistent average-rank formula is used for every value so the
+    scale is internally consistent end to end: tied values share their mean rank,
+    so the minimum maps to exactly 0 and the maximum to exactly 100 only when that
+    extreme is unique (a tied extreme sits at its tie group's mean rank). An
+    all-equal or single-value column has no spread and maps to the midpoint (50).
+    """
    value = pl.col(column).fill_nan(None)
    non_null_count = value.count()
    rank = value.rank("average")
    return (
        pl.when(value.is_null())
        .then(None)
-        .when(value == value.min())
-        .then(0.0)
-        .when(value == value.max())
-        .then(100.0)
        .when(non_null_count > 1)
        .then(((rank - 1) / (non_null_count - 1) * 100).round(1))
-        .otherwise(100.0)
+        .otherwise(50.0)
        .cast(pl.Float32)
        .alias(alias)
    )
@@ -220,7 +241,7 @@ def _coverage_percentile_expr(column: str, alias: str) -> pl.Expr:
 def _with_postcode_density_percentiles(
    postcode_metrics: pl.DataFrame, radius_m: int
 ) -> pl.DataFrame:
-    density_col, _area_col, _count_col, _height_col = _metric_columns(radius_m)
+    density_col, _area_col, _height_col = _metric_columns(radius_m)
    return postcode_metrics.with_columns(
        _coverage_percentile_expr(
            density_col,
@@ -229,28 +250,88 @@ def _with_postcode_density_percentiles(
    )


-def _accumulate_tree_metrics(
+def _postcode_buffers(
+    points: pl.DataFrame, radius_m: int
+) -> tuple[np.ndarray, shapely.STRtree]:
+    """Build a radius-r circle for every postcode plus an STRtree over them.
+
+    Circle index == postcode index, so an STRtree match resolves directly to the
+    postcode accumulator slot.
+    """
+    xy = points.select("x", "y").to_numpy()
+    circles = shapely.buffer(shapely.points(xy), radius_m, quad_segs=8)
+    return circles, shapely.STRtree(circles)
+
+
+def _accumulate_clipped_area(
+    geoms: np.ndarray,
+    circles: np.ndarray,
+    tree: shapely.STRtree,
+    canopy_area: np.ndarray,
+    height: np.ndarray | None = None,
+    height_weighted_sum: np.ndarray | None = None,
+    height_weight: np.ndarray | None = None,
+) -> None:
+    """Add each polygon's in-buffer overlap area to every postcode it intersects.
+
+    Only area(polygon ∩ circle) is accumulated -- never the area of the polygon
+    that falls outside the postcode's extended buffer -- so a crown straddling
+    the buffer edge contributes only its inside portion and a large parcel cannot
+    saturate a postcode from mere proximity. When ``height`` is supplied the mean
+    feature height is accumulated weighted by that same clipped overlap area.
+    """
+    keep = ~shapely.is_missing(geoms) & ~shapely.is_empty(geoms)
+    geoms = geoms[keep]
+    if height is not None:
+        height = height[keep]
+    if geoms.size == 0:
+        return
+
+    # query(predicate="intersects") over the circle STRtree returns exactly the
+    # (polygon, circle) pairs whose clipped overlap can be positive -- i.e. the
+    # polygon overlaps that postcode's radius-r buffer.
+    geom_index, postcode_index = tree.query(geoms, predicate="intersects")
+    if geom_index.size == 0:
+        return
+
+    clipped_area = shapely.area(
+        shapely.intersection(geoms[geom_index], circles[postcode_index])
+    )
+    positive = clipped_area > 0
+    geom_index = geom_index[positive]
+    postcode_index = postcode_index[positive]
+    clipped_area = clipped_area[positive]
+
+    np.add.at(canopy_area, postcode_index, clipped_area)
+
+    if height is not None:
+        feature_height = height[geom_index]
+        finite = np.isfinite(feature_height)
+        if finite.any():
+            np.add.at(
+                height_weighted_sum,
+                postcode_index[finite],
+                feature_height[finite] * clipped_area[finite],
+            )
+            np.add.at(height_weight, postcode_index[finite], clipped_area[finite])
+
+
+def _accumulate_tow_metrics(
    dataset_path: str,
-    points: pl.DataFrame,
-    radius_m: int,
+    circles: np.ndarray,
+    tree: shapely.STRtree,
+    canopy_area: np.ndarray,
+    height_weighted_sum: np.ndarray,
+    height_weight: np.ndarray,
    batch_size: int,
    layer_names: tuple[str, ...] | None,
    max_features_per_layer: int | None,
-    workers: int,
-    canopy_area: np.ndarray,
-    feature_count: np.ndarray,
-    height_weighted_sum: np.ndarray,
-    height_weight: np.ndarray,
 ) -> None:
-    xy = points.select("x", "y").to_numpy()
-    tree = cKDTree(xy)
-
    layers = _layers(dataset_path, layer_names)
    print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}")

-    columns = ["Woodland_Type", "TOW_Area_M", "MEANHT"]
+    columns = ["MEANHT"]
    total_features_seen = 0
-    total_features_used = 0

    for layer in layers:
        info = pyogrio.read_info(dataset_path, layer=layer)
@@ -263,7 +344,7 @@ def _accumulate_tree_metrics(
            columns=columns,
            batch_size=batch_size,
            use_pyarrow=True,
-        ) as (_meta, reader):
+        ) as (meta, reader):
            for batch_index, batch in enumerate(reader, start=1):
                if max_features_per_layer is not None:
                    remaining = max_features_per_layer - layer_features_seen
@@ -275,135 +356,29 @@ def _accumulate_tree_metrics(
                layer_features_seen += batch.num_rows
                total_features_seen += batch.num_rows
                names = batch.schema.names
-                area = np.asarray(
-                    batch.column(names.index("TOW_Area_M")).to_numpy(zero_copy_only=False),
-                    dtype=np.float64,
-                )
+                geometry_column = _geometry_column(meta, names)
                height = np.asarray(
                    batch.column(names.index("MEANHT")).to_numpy(zero_copy_only=False),
                    dtype=np.float64,
                )
                geometry = np.asarray(
-                    batch.column(names.index("SHAPE")).to_numpy(zero_copy_only=False),
+                    batch.column(names.index(geometry_column)).to_numpy(
+                        zero_copy_only=False
+                    ),
                    dtype=object,
                )
-
-                valid = np.isfinite(area) & (area > 0)
-                if not valid.any():
-                    continue
-
-                geometry = geometry[valid]
-                area = area[valid]
-                height = height[valid]
-
-                centroids = shapely.centroid(shapely.from_wkb(geometry))
-                x = shapely.get_x(centroids)
-                y = shapely.get_y(centroids)
-                valid_xy = np.isfinite(x) & np.isfinite(y)
-                if not valid_xy.any():
-                    continue
-
-                x = x[valid_xy]
-                y = y[valid_xy]
-                area = area[valid_xy]
-                height = height[valid_xy]
-
-                nearby = tree.query_ball_point(
-                    np.column_stack((x, y)), radius_m, workers=workers
+                _accumulate_clipped_area(
+                    shapely.from_wkb(geometry),
+                    circles,
+                    tree,
+                    canopy_area,
+                    height=height,
+                    height_weighted_sum=height_weighted_sum,
+                    height_weight=height_weight,
                )
-                lengths = np.fromiter(
-                    (len(postcode_indexes) for postcode_indexes in nearby),
-                    dtype=np.int32,
-                    count=len(nearby),
-                )
-                matching_features = lengths > 0
-                if matching_features.any():
-                    postcode_indexes = np.concatenate(
-                        [indexes for indexes in nearby if indexes]
-                    ).astype(np.int64, copy=False)
-                    feature_indexes = np.repeat(
-                        np.flatnonzero(matching_features), lengths[matching_features]
-                    )

-                    np.add.at(canopy_area, postcode_indexes, area[feature_indexes])
-                    np.add.at(feature_count, postcode_indexes, 1)
-
-                    feature_height = height[feature_indexes]
-                    valid_height = np.isfinite(feature_height)
-                    if valid_height.any():
-                        height_area = area[feature_indexes][valid_height]
-                        np.add.at(
-                            height_weighted_sum,
-                            postcode_indexes[valid_height],
-                            feature_height[valid_height] * height_area,
-                        )
-                        np.add.at(
-                            height_weight,
-                            postcode_indexes[valid_height],
-                            height_area,
-                        )
-
-                total_features_used += len(area)
                if batch_index == 1 or batch_index % 25 == 0:
-                    print(
-                        f"  batch {batch_index:,}: "
-                        f"{total_features_seen:,} rows read, "
-                        f"{total_features_used:,} features with usable centroids"
-                    )
-
-
-def _postcode_buffers(
-    points: pl.DataFrame, radius_m: int
-) -> tuple[np.ndarray, shapely.STRtree]:
-    """Build a radius-r circle for every postcode plus an STRtree over them.
-
-    Circle index == postcode index, matching the order used by the cKDTree path.
-    """
-    xy = points.select("x", "y").to_numpy()
-    circles = shapely.buffer(shapely.points(xy), radius_m, quad_segs=8)
-    return circles, shapely.STRtree(circles)
-
-
-def _add_nfi_batch(
-    geoms: np.ndarray,
-    category: np.ndarray,
-    circles: np.ndarray,
-    tree: shapely.STRtree,
-    canopy_area: np.ndarray,
-    feature_count: np.ndarray,
-    radius_m: int,
-) -> None:
-    """Add NFI woodland into the shared arrays by true buffer-clipped area.
-
-    Unlike the TOW centroid path, this clips each woodland polygon to each
-    nearby postcode circle and adds only area(polygon ∩ circle); a large parcel
-    therefore cannot saturate a postcode from mere centroid proximity, and a
-    buffer-filling parcel whose centroid is outside the radius is not missed.
-    """
-    keep = (category == NFI_WOODLAND_VALUE) & ~shapely.is_missing(geoms)
-    geoms = geoms[keep]
-    if geoms.size:
-        geoms = geoms[~shapely.is_empty(geoms)]
-    if geoms.size == 0:
-        return
-
-    # dwithin(polygon, point, r) is true iff the radius-r circle around the
-    # point intersects the polygon -- exactly the candidate set we want.
-    nfi_index, postcode_index = tree.query(
-        geoms, predicate="dwithin", distance=radius_m
-    )
-    if nfi_index.size == 0:
-        return
-
-    clipped_area = shapely.area(
-        shapely.intersection(geoms[nfi_index], circles[postcode_index])
-    )
-    positive = clipped_area > 0
-    postcode_index = postcode_index[positive]
-    clipped_area = clipped_area[positive]
-
-    np.add.at(canopy_area, postcode_index, clipped_area)
-    np.add.at(feature_count, postcode_index, 1)
+                    print(f"  batch {batch_index:,}: {total_features_seen:,} rows read")


 def _accumulate_nfi_metrics(
@@ -411,8 +386,6 @@ def _accumulate_nfi_metrics(
    circles: np.ndarray,
    tree: shapely.STRtree,
    canopy_area: np.ndarray,
-    feature_count: np.ndarray,
-    radius_m: int,
    batch_size: int,
    max_nfi_features: int | None,
 ) -> None:
@@ -455,14 +428,12 @@ def _accumulate_nfi_metrics(
                    ),
                    dtype=object,
                )
-                _add_nfi_batch(
-                    shapely.from_wkb(geometry),
-                    category,
+                geoms = shapely.from_wkb(geometry)
+                _accumulate_clipped_area(
+                    geoms[category == NFI_WOODLAND_VALUE],
                    circles,
                    tree,
                    canopy_area,
-                    feature_count,
-                    radius_m,
                )
                if batch_index == 1 or batch_index % 25 == 0:
                    print(f"  NFI batch {batch_index:,}: {features_seen:,} rows read")
@@ -471,15 +442,26 @@ def _accumulate_nfi_metrics(
 def _finalize_metrics(
    points: pl.DataFrame,
    canopy_area: np.ndarray,
-    feature_count: np.ndarray,
    height_weighted_sum: np.ndarray,
    height_weight: np.ndarray,
    radius_m: int,
 ) -> pl.DataFrame:
    n_points = points.height
-    density_col, area_col, count_col, height_col = _metric_columns(radius_m)
+    density_col, area_col, height_col = _metric_columns(radius_m)
    buffer_area = math.pi * radius_m * radius_m
-    density_pct = np.minimum(canopy_area / buffer_area * 100.0, 100.0)
+    raw_density = canopy_area / buffer_area * 100.0
+    density_pct = np.minimum(raw_density, 100.0)
+
+    # Symptom of the assumed-disjoint TOW/NFI union being violated (or of
+    # overlapping crowns inside one buffer): clipped areas alone cannot exceed the
+    # buffer unless polygons overlap. Surface it rather than hide it behind the cap.
+    over_count = int(np.count_nonzero(raw_density > 100.0))
+    if over_count:
+        print(
+            f"  note: {over_count:,} postcode(s) exceeded 100% raw canopy and were "
+            "capped — indicates overlapping TOW/NFI canopy within the buffer"
+        )
+
    mean_height = np.divide(
        height_weighted_sum,
        height_weight,
@@ -492,7 +474,6 @@ def _finalize_metrics(
            "postcode": points["postcode"],
            area_col: canopy_area.round(1).astype(np.float32),
            density_col: density_pct.round(1).astype(np.float32),
-            count_col: feature_count.astype(np.uint32),
            height_col: np.round(mean_height, 1).astype(np.float32),
        }
    ).with_columns(
@@ -500,181 +481,9 @@ def _finalize_metrics(
    )


-def _clean_key_expr(column: str) -> pl.Expr:
-    return (
-        pl.col(column)
-        .fill_null("")
-        .str.to_uppercase()
-        .str.replace_all(r"[^A-Z0-9]+", " ")
-        .str.replace_all(r"\s+", " ")
-        .str.strip_chars()
-    )
-
-
-def _latest_price_paid_addresses(price_paid_path: Path) -> pl.LazyFrame:
-    return (
-        pl.scan_parquet(price_paid_path)
-        .select(
-            pl.col("postcode").str.strip_chars().str.to_uppercase().alias("postcode"),
-            "paon",
-            "saon",
-            "street",
-            "locality",
-            "town_city",
-            "district",
-            "county",
-            "date_of_transfer",
-        )
-        .filter(pl.col("postcode").is_not_null())
-        .filter(pl.col("street").is_not_null())
-        .filter(_clean_key_expr("street") != "")
-        .with_columns(
-            pl.concat_str(
-                [pl.col("saon"), pl.col("paon"), pl.col("street")],
-                separator=" ",
-                ignore_nulls=True,
-            )
-            .str.replace_all(r"\s+", " ")
-            .str.strip_chars()
-            .alias("pp_address"),
-        )
-        .filter(pl.col("pp_address").is_not_null())
-        .sort("date_of_transfer")
-        .group_by("postcode", "pp_address", maintain_order=True)
-        .agg(
-            pl.col("street").last(),
-            pl.col("locality").last(),
-            pl.col("town_city").last(),
-            pl.col("district").last(),
-            pl.col("county").last(),
-        )
-        .with_columns(
-            pl.concat_str(
-                [
-                    _clean_key_expr("street"),
-                    _clean_key_expr("town_city"),
-                    _clean_key_expr("district"),
-                    _clean_key_expr("county"),
-                ],
-                separator="|",
-            ).alias("street_key")
-        )
-    )
-
-
-def _weighted_mean_expr(column: str, weight: str) -> pl.Expr:
-    valid = pl.col(column).is_not_null() & ~pl.col(column).is_nan()
-    numerator = pl.when(valid).then(pl.col(column) * pl.col(weight)).sum()
-    denominator = pl.when(valid).then(pl.col(weight)).sum()
-    return pl.when(denominator > 0).then(numerator / denominator).otherwise(None)
-
-
-def _write_street_rollups(
-    postcode_metrics: pl.DataFrame,
-    price_paid_path: Path,
-    output_streets: Path | None,
-    output_addresses: Path | None,
-    radius_m: int,
-) -> None:
-    if output_streets is None and output_addresses is None:
-        return
-
-    density_col, area_col, count_col, height_col = _metric_columns(radius_m)
-    metrics = postcode_metrics.lazy()
-    addresses = _latest_price_paid_addresses(price_paid_path).join(
-        metrics, on="postcode", how="inner"
-    )
-
-    per_postcode = (
-        addresses.group_by(
-            "street_key",
-            "postcode",
-            "street",
-            "locality",
-            "town_city",
-            "district",
-            "county",
-        )
-        .agg(
-            pl.len().alias("address_count"),
-            pl.col(density_col).first(),
-            pl.col(area_col).first(),
-            pl.col(count_col).first(),
-            pl.col(height_col).first(),
-        )
-        .collect()
-    )
-
-    streets = (
-        per_postcode.lazy()
-        .group_by("street_key")
-        .agg(
-            pl.col("street").first(),
-            pl.col("locality").first(),
-            pl.col("town_city").first(),
-            pl.col("district").first(),
-            pl.col("county").first(),
-            pl.col("postcode").n_unique().alias("postcode_count"),
-            pl.col("address_count").sum().alias("address_count"),
-            _weighted_mean_expr(density_col, "address_count")
-            .round(1)
-            .cast(pl.Float32)
-            .alias(STREET_TREE_COVERAGE_COL),
-            _weighted_mean_expr(area_col, "address_count")
-            .round(1)
-            .cast(pl.Float32)
-            .alias(f"Street average {area_col}"),
-            _weighted_mean_expr(count_col, "address_count")
-            .round(1)
-            .cast(pl.Float32)
-            .alias(f"Street average {count_col}"),
-            _weighted_mean_expr(height_col, "address_count")
-            .round(1)
-            .cast(pl.Float32)
-            .alias(f"Street average {height_col}"),
-        )
-        .with_columns(
-            _coverage_percentile_expr(
-                STREET_TREE_COVERAGE_COL,
-                STREET_TREE_DENSITY_COL,
-            )
-        )
-        .sort("street_key")
-        .collect()
-    )
-
-    if output_addresses is not None:
-        output_addresses.parent.mkdir(parents=True, exist_ok=True)
-        address_output = addresses.join(
-            streets.lazy().select(
-                "street_key",
-                STREET_TREE_COVERAGE_COL,
-                STREET_TREE_DENSITY_COL,
-            ),
-            on="street_key",
-            how="left",
-        )
-        address_output.sink_parquet(output_addresses, compression="zstd")
-        print(f"Wrote address tree-density join: {output_addresses}")
-
-    if output_streets is not None:
-        output_streets.parent.mkdir(parents=True, exist_ok=True)
-        streets.write_parquet(output_streets, compression="zstd")
-        print(f"Wrote street tree-density rollup: {output_streets}")
-
-
-def _parse_csv_arg(value: str | None) -> tuple[str, ...] | None:
-    if value is None:
-        return None
-    if value.lower() == "all":
-        return None
-    parts = tuple(part.strip() for part in value.split(",") if part.strip())
-    return parts or None
-
-
 def main() -> None:
    parser = argparse.ArgumentParser(
-        description="Build postcode and street tree-density metrics from FR_TOW_V1_ALL.zip"
+        description="Build postcode-level tree-density metrics from FR_TOW_V1_ALL.zip"
    )
    parser.add_argument(
        "--tow-zip",
@@ -716,35 +525,17 @@ def main() -> None:
        default=Path("property-data/arcgis_data.parquet"),
        help="Postcode centroid parquet with east1m/north1m columns",
    )
-    parser.add_argument(
-        "--price-paid",
-        type=Path,
-        default=None,
-        help="Optional Price Paid parquet used to roll postcode metrics up to streets",
-    )
    parser.add_argument(
        "--output-postcodes",
        type=Path,
        required=True,
        help="Output postcode-level tree-density parquet",
    )
-    parser.add_argument(
-        "--output-streets",
-        type=Path,
-        default=None,
-        help="Optional output street-level tree-density parquet",
-    )
-    parser.add_argument(
-        "--output-addresses",
-        type=Path,
-        default=None,
-        help="Optional output address/street join parquet keyed by postcode and pp_address",
-    )
    parser.add_argument(
        "--radius-m",
        type=int,
        default=50,
-        help="Radius around each postcode centroid used as the street-scale buffer",
+        help="Radius around each postcode centroid used as the extended buffer",
    )
    parser.add_argument(
        "--layers",
@@ -757,12 +548,6 @@ def main() -> None:
        default=65_536,
        help="Arrow batch size for reading TOW features",
    )
-    parser.add_argument(
-        "--workers",
-        type=int,
-        default=-1,
-        help="Worker count passed to scipy cKDTree.query_ball_point",
-    )
    parser.add_argument(
        "--max-postcodes",
        type=int,
@@ -783,9 +568,6 @@ def main() -> None:
    )
    args = parser.parse_args()

-    if (args.output_streets or args.output_addresses) and args.price_paid is None:
-        raise SystemExit("--price-paid is required when writing street/address outputs")
-
    if args.radius_m <= 0:
        raise SystemExit("--radius-m must be greater than zero")

@@ -797,36 +579,32 @@ def main() -> None:

    n_points = points.height
    canopy_area = np.zeros(n_points, dtype=np.float64)
-    feature_count = np.zeros(n_points, dtype=np.uint32)
    height_weighted_sum = np.zeros(n_points, dtype=np.float64)
    height_weight = np.zeros(n_points, dtype=np.float64)

-    _accumulate_tree_metrics(
+    circles, tree = _postcode_buffers(points, args.radius_m)
+
+    _accumulate_tow_metrics(
        dataset_path=dataset_path,
-        points=points,
-        radius_m=args.radius_m,
+        circles=circles,
+        tree=tree,
+        canopy_area=canopy_area,
+        height_weighted_sum=height_weighted_sum,
+        height_weight=height_weight,
        batch_size=args.batch_size,
        layer_names=layer_names,
        max_features_per_layer=args.max_features_per_layer,
-        workers=args.workers,
-        canopy_area=canopy_area,
-        feature_count=feature_count,
-        height_weighted_sum=height_weighted_sum,
-        height_weight=height_weight,
    )

    if args.nfi_zip is not None and args.nfi_zip.exists():
        nfi_path = _nfi_dataset_path(
            args.nfi_zip, args.nfi_extract_dir, args.force_extract, args.use_vsizip
        )
-        circles, nfi_tree = _postcode_buffers(points, args.radius_m)
        _accumulate_nfi_metrics(
            dataset_path=nfi_path,
            circles=circles,
-            tree=nfi_tree,
+            tree=tree,
            canopy_area=canopy_area,
-            feature_count=feature_count,
-            radius_m=args.radius_m,
            batch_size=args.batch_size,
            max_nfi_features=args.max_nfi_features,
        )
@@ -836,7 +614,6 @@ def main() -> None:
    postcode_metrics = _finalize_metrics(
        points,
        canopy_area,
-        feature_count,
        height_weighted_sum,
        height_weight,
        args.radius_m,
@@ -849,14 +626,14 @@ def main() -> None:
    postcode_metrics.write_parquet(args.output_postcodes, compression="zstd")
    print(f"\nWrote postcode tree-density metrics: {args.output_postcodes}")

-    if args.price_paid is not None:
-        _write_street_rollups(
-            postcode_metrics=postcode_metrics,
-            price_paid_path=args.price_paid,
-            output_streets=args.output_streets,
-            output_addresses=args.output_addresses,
-            radius_m=args.radius_m,
-        )
+
+def _parse_csv_arg(value: str | None) -> tuple[str, ...] | None:
+    if value is None:
+        return None
+    if value.lower() == "all":
+        return None
+    parts = tuple(part.strip() for part in value.split(",") if part.strip())
+    return parts or None


 if __name__ == "__main__":