scraping and data

This commit is contained in:
Andras Schmelczer 2026-05-31 15:36:33 +01:00
parent d98819b569
commit 8688b7475e
43 changed files with 4920 additions and 531 deletions

View file

@ -1,19 +1,83 @@
import math
from pathlib import Path
import numpy as np
import polars as pl
import pytest
import shapely
from pipeline.transform.tree_density import (
STREET_TREE_COVERAGE_COL,
STREET_TREE_DENSITY_COL,
_add_nfi_batch,
_coverage_percentile_expr,
_metric_columns,
_postcode_buffers,
_postcode_density_percentile_col,
_with_postcode_density_percentiles,
_write_street_rollups,
)
def test_nfi_accumulation_adds_only_clipped_overlap_area() -> None:
    """Check that ``_add_nfi_batch`` credits only the parcel/buffer overlap.

    Two postcodes are set up: A at the origin and B 1 km to the east, each
    buffered by 50 m.  Three independent scenarios are then run, every one
    against freshly zeroed accumulators:

    1. a woodland square that fully covers A's buffer,
    2. a woodland parcel that only slivers into B's buffer,
    3. the first square again, but tagged with a non-woodland category.
    """
    buffer_radius = 50
    centres = pl.DataFrame(
        {"postcode": ["A", "B"], "x": [0.0, 1000.0], "y": [0.0, 0.0]}
    )
    circles, tree = _postcode_buffers(centres, buffer_radius)
    full_circle_area = math.pi * buffer_radius * buffer_radius

    def accumulate(parcel, category: str) -> tuple[np.ndarray, np.ndarray]:
        """Feed one single-parcel batch into zeroed per-postcode accumulators."""
        area_sum = np.zeros(2)
        hits = np.zeros(2, dtype=np.uint32)
        _add_nfi_batch(
            np.array([parcel], dtype=object),
            np.array([category], dtype=object),
            circles,
            tree,
            area_sum,
            hits,
            buffer_radius,
        )
        return area_sum, hits

    # Scenario 1: a 1,000,000 sqm woodland square centred on postcode A, so it
    # swallows A's whole circle.
    covering_square = shapely.box(-500, -500, 500, 500)
    canopy_area, feature_count = accumulate(covering_square, "Woodland")
    # What gets added is the clipped circle (the 32-gon buffer approximates the
    # circle to ~1%), NOT the full area of the square.
    assert canopy_area[0] == pytest.approx(full_circle_area, rel=1e-2)
    assert canopy_area[0] <= full_circle_area  # never exceeds the buffer area
    assert canopy_area[1] == 0.0  # postcode B is 1km away, no overlap
    assert feature_count.tolist() == [1, 0]

    # Scenario 2: a large parcel whose left edge sits 10m inside B's circle.
    # Only that sliver may be credited, never the parcel's full area -- the
    # failure mode the old centroid path could not avoid.
    edge_parcel = shapely.box(1040, -500, 2000, 500)
    canopy_area, feature_count = accumulate(edge_parcel, "Woodland")
    assert canopy_area[0] == 0.0
    assert 0.0 < canopy_area[1] < full_circle_area  # tiny segment, far below 1M sqm

    # Scenario 3: non-woodland categories contribute nothing at all.
    canopy_area, feature_count = accumulate(covering_square, "Non woodland")
    assert canopy_area.tolist() == [0.0, 0.0]
    assert feature_count.tolist() == [0, 0]
def test_coverage_percentile_expr_ranks_higher_coverage_higher() -> None:
df = pl.DataFrame({"coverage": [0.0, 5.0, 10.0, None]})