scraping and data

2026-05-31 15:36:33 +01:00 · 2026-05-31 15:36:33 +01:00 · 8688b7475e
commit 8688b7475e
parent d98819b569
43 changed files with 4920 additions and 531 deletions
--- a/pipeline/transform/test_tree_density.py
+++ b/pipeline/transform/test_tree_density.py
@@ -1,19 +1,83 @@
+import math
 from pathlib import Path

+import numpy as np
 import polars as pl
 import pytest
+import shapely

 from pipeline.transform.tree_density import (
    STREET_TREE_COVERAGE_COL,
    STREET_TREE_DENSITY_COL,
+    _add_nfi_batch,
    _coverage_percentile_expr,
    _metric_columns,
+    _postcode_buffers,
    _postcode_density_percentile_col,
    _with_postcode_density_percentiles,
    _write_street_rollups,
 )


+def test_nfi_accumulation_adds_only_clipped_overlap_area() -> None:
+    """Check that _add_nfi_batch adds only the clipped overlap area per postcode."""
+    radius_m = 50
+    centres = pl.DataFrame({"postcode": ["A", "B"], "x": [0.0, 1000.0], "y": [0.0, 0.0]})
+    circles, tree = _postcode_buffers(centres, radius_m)
+    buffer_area = math.pi * radius_m * radius_m
+
+    def accumulate(parcel: object, category: str) -> tuple[np.ndarray, np.ndarray]:
+        """Feed one parcel through _add_nfi_batch using freshly zeroed accumulators."""
+        canopy_area = np.zeros(2)
+        feature_count = np.zeros(2, dtype=np.uint32)
+        geometries = np.array([parcel], dtype=object)
+        categories = np.array([category], dtype=object)
+        _add_nfi_batch(
+            geometries, categories, circles, tree, canopy_area, feature_count, radius_m
+        )
+        return canopy_area, feature_count
+
+    # Postcode A sits at the origin and B 1km east of it; both use a 50m radius.
+    whole_cover = shapely.box(-500, -500, 500, 500)  # 1,000,000 sqm parcel around A
+    edge_sliver = shapely.box(1040, -500, 2000, 500)  # left edge 10m inside B's circle
+
+    # Case 1: woodland that fully covers A's circle contributes only the clipped
+    # circle (the 32-gon buffer approximates the circle to ~1%), NOT the full
+    # 1,000,000 sqm polygon.
+    areas, counts = accumulate(whole_cover, "Woodland")
+    assert areas[0] == pytest.approx(buffer_area, rel=1e-2)
+    assert areas[0] <= buffer_area  # never exceeds the buffer area
+    assert areas[1] == 0.0  # postcode B is 1km away, no overlap
+    assert counts.tolist() == [1, 0]
+
+    # Case 2: a large parcel that only slivers into B's circle must add only the
+    # sliver, not its full area -- the failure mode the old centroid path could
+    # not avoid.
+    areas, _ = accumulate(edge_sliver, "Woodland")
+    assert areas[0] == 0.0
+    assert 0.0 < areas[1] < buffer_area  # tiny segment, far below 1M sqm
+
+    # Case 3: non-woodland categories contribute nothing.
+    areas, counts = accumulate(whole_cover, "Non woodland")
+    assert areas.tolist() == [0.0, 0.0]
+    assert counts.tolist() == [0, 0]
+
+
 def test_coverage_percentile_expr_ranks_higher_coverage_higher() -> None:
    df = pl.DataFrame({"coverage": [0.0, 5.0, 10.0, None]})