import math
from pathlib import Path

import numpy as np
import polars as pl
import pytest
import shapely

from pipeline.transform.tree_density import (
    STREET_TREE_COVERAGE_COL,
    STREET_TREE_DENSITY_COL,
    _add_nfi_batch,
    _coverage_percentile_expr,
    _metric_columns,
    _postcode_buffers,
    _postcode_density_percentile_col,
    _with_postcode_density_percentiles,
    _write_street_rollups,
)


def test_nfi_accumulation_adds_only_clipped_overlap_area() -> None:
    """Check that an NFI batch adds only the part of a parcel inside a buffer.

    Two postcodes, A at x=0 and B at x=1000, get 50 m buffers. Three
    single-feature batches are then accumulated, each into fresh zeroed
    arrays: a woodland parcel covering A's buffer, a woodland parcel that
    only slivers into B's buffer, and a non-woodland parcel.
    """
    radius_m = 50
    postcode_points = pl.DataFrame(
        {"postcode": ["A", "B"], "x": [0.0, 1000.0], "y": [0.0, 0.0]}
    )
    circles, tree = _postcode_buffers(postcode_points, radius_m)
    buffer_area = math.pi * radius_m * radius_m

    def accumulate(parcel, category):
        """Feed one (parcel, category) feature into zeroed accumulators.

        Returns the ``(canopy_area, feature_count)`` arrays after the call.
        """
        area_totals = np.zeros(2)
        hit_counts = np.zeros(2, dtype=np.uint32)
        _add_nfi_batch(
            np.array([parcel], dtype=object),
            np.array([category], dtype=object),
            circles,
            tree,
            area_totals,
            hit_counts,
            radius_m,
        )
        return area_totals, hit_counts

    # A large woodland square centred on postcode A fully covers A's circle.
    big = shapely.box(-500, -500, 500, 500)  # 1,000,000 sqm parcel
    canopy_area, feature_count = accumulate(big, "Woodland")

    # Only the clipped circle area is added (the 32-gon buffer approximates
    # the circle to ~1%), NOT the full 1,000,000 sqm polygon.
    assert canopy_area[0] == pytest.approx(buffer_area, rel=1e-2)
    assert canopy_area[0] <= buffer_area  # never exceeds the buffer area
    assert canopy_area[1] == 0.0  # postcode B is 1km away, no overlap
    assert feature_count.tolist() == [1, 0]

    # A large parcel that only slivers into B's circle must add only the
    # sliver, not its full area -- the failure mode the old centroid path
    # could not avoid.
    sliver = shapely.box(1040, -500, 2000, 500)  # left edge 10m inside B's circle
    canopy_area, feature_count = accumulate(sliver, "Woodland")

    assert canopy_area[0] == 0.0
    assert 0.0 < canopy_area[1] < buffer_area  # tiny segment, far below 1M sqm

    # Non-woodland categories contribute nothing.
    canopy_area, feature_count = accumulate(big, "Non woodland")

    assert canopy_area.tolist() == [0.0, 0.0]
    assert feature_count.tolist() == [0, 0]


def test_coverage_percentile_expr_ranks_higher_coverage_higher() -> None:
    """Expect percentiles to rise with coverage and a null input to stay null."""
    frame = pl.DataFrame({"coverage": [0.0, 5.0, 10.0, None]})
    percentile_expr = _coverage_percentile_expr("coverage", "percentile")

    ranked = frame.lazy().with_columns(percentile_expr).collect()

    assert ranked["percentile"].to_list() == [0.0, 50.0, 100.0, None]


def test_coverage_percentile_expr_uses_exact_scale_endpoints() -> None:
    """Expect tied minimum values to map to 0.0 and tied maximum values to 100.0."""
    frame = pl.DataFrame({"coverage": [0.0, 0.0, 5.0, 10.0, 10.0]})
    percentile_expr = _coverage_percentile_expr("coverage", "percentile")

    ranked = frame.lazy().with_columns(percentile_expr).collect()

    assert ranked["percentile"].to_list() == [0.0, 0.0, 50.0, 100.0, 100.0]


def test_street_rollup_percentiles_are_ranked_over_raw_street_coverage(
    tmp_path: Path,
) -> None:
    """Run the street rollup on a tiny fixture and check the written outputs.

    Three postcodes with known metrics are joined to four price-paid rows
    (three on "Oak Road", one on "Elm Street"). The test then reads back the
    street and address parquet files and checks the street coverage values,
    the street density values, and that the expected columns exist on the
    address output.
    """
    radius_m = 50
    density_col, area_col, count_col, height_col = _metric_columns(radius_m)
    percentile_col = _postcode_density_percentile_col(radius_m)

    raw_postcode_metrics = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "AA1 1AB", "AA1 1AC"],
            density_col: [10.0, 30.0, 50.0],
            area_col: [100.0, 300.0, 500.0],
            count_col: [1, 3, 5],
            height_col: [4.0, 6.0, 8.0],
        }
    )
    postcode_metrics = _with_postcode_density_percentiles(
        raw_postcode_metrics, radius_m
    )

    price_paid = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "AA1 1AA", "AA1 1AB", "AA1 1AC"],
            "paon": ["1", "2", "3", "4"],
            "saon": [""] * 4,
            "street": ["Oak Road", "Oak Road", "Oak Road", "Elm Street"],
            "locality": [""] * 4,
            "town_city": ["Test Town"] * 4,
            "district": ["Test District"] * 4,
            "county": ["Test County"] * 4,
            "date_of_transfer": [
                "2024-01-01",
                "2024-01-02",
                "2024-01-03",
                "2024-01-04",
            ],
        }
    )

    # All inputs and outputs live under pytest's per-test temporary directory.
    price_paid_path = tmp_path / "price-paid.parquet"
    output_streets = tmp_path / "streets.parquet"
    output_addresses = tmp_path / "addresses.parquet"
    price_paid.write_parquet(price_paid_path)

    _write_street_rollups(
        postcode_metrics=postcode_metrics,
        price_paid_path=price_paid_path,
        output_streets=output_streets,
        output_addresses=output_addresses,
        radius_m=radius_m,
    )

    # Sort by street name so the expected row order is deterministic.
    streets = pl.read_parquet(output_streets).sort("street")
    addresses = pl.read_parquet(output_addresses)

    assert streets["street"].to_list() == ["Elm Street", "Oak Road"]
    assert streets[STREET_TREE_COVERAGE_COL].to_list() == pytest.approx([50.0, 16.7])
    assert streets.select("street", STREET_TREE_DENSITY_COL).rows() == [
        ("Elm Street", 100.0),
        ("Oak Road", 0.0),
    ]

    for expected_column in (
        percentile_col,
        STREET_TREE_COVERAGE_COL,
        STREET_TREE_DENSITY_COL,
    ):
        assert expected_column in addresses.columns