Improve data

This commit is contained in:
Andras Schmelczer 2026-06-10 07:54:25 +01:00
parent b4d66a28c1
commit 85da1941aa
31 changed files with 901 additions and 319 deletions

View file

@ -252,14 +252,15 @@ def test_avg_yr_is_simple_mean_of_year_bars(tmp_path):
assert bars == {2023: pytest.approx(24.0, abs=0.05), 2024: pytest.approx(12.0, abs=0.05)}
def test_serious_rollup_avg_yr_equals_mean_of_rollup_bars(tmp_path):
def test_serious_rollup_avg_yr_equals_sum_of_components(tmp_path):
# Two SERIOUS types occur in DISJOINT years for one postcode: Burglary only in
# 2014, Robbery only in 2024 (each a single full month -> 12/yr). The headline
# "Serious crime (avg/yr)" must equal the mean of the "Serious crime (by year)"
# bars (which span the UNION of years any serious type occurred), NOT the sum
# of the per-type means. Summing per-type means divides each type by its OWN
# years-present (1 each) -> 12 + 12 = 24; the consistent rollup divides the
# per-year serious total by the years any serious type occurred (2) -> 12.
# "Serious crime (avg/yr)" must equal the SUM of its component (avg/yr) columns
# (Burglary 12 + Robbery 12 = 24), so the rollup is always the sum of the parts
# shown beside it and can never fall below a single component. (The previous
# union-years-present mean would have divided the per-year serious total by the
# 2 years any serious type occurred, giving a misleading 12 that is no higher
# than either single component (Burglary 12, Robbery 12) and half their sum.)
units = tmp_path / "units"
_write_boundaries(
units, {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]}
@ -274,13 +275,16 @@ def test_serious_rollup_avg_yr_equals_mean_of_rollup_bars(tmp_path):
transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)
avg = pl.read_parquet(output).row(0, named=True)
# The precomputed rollup headline exists and equals the mean of the bars (12),
# not the sum of the per-type avg/yr values (Burglary 12 + Robbery 12 = 24).
assert "Serious crime (avg/yr)" in avg
assert avg["Burglary (avg/yr)"] == pytest.approx(12.0, abs=0.05)
assert avg["Robbery (avg/yr)"] == pytest.approx(12.0, abs=0.05)
assert avg["Serious crime (avg/yr)"] == pytest.approx(12.0, abs=0.05)
# Rollup == sum of its component (avg/yr) columns.
assert avg["Serious crime (avg/yr)"] == pytest.approx(24.0, abs=0.05)
assert avg["Serious crime (avg/yr)"] == pytest.approx(
avg["Burglary (avg/yr)"] + avg["Robbery (avg/yr)"], abs=0.05
)
# The by-year rollup series remains the per-year sum of the component bars.
serious_bars = {
p["year"]: p["count"]
for p in pl.read_parquet(by_year).row(0, named=True)["Serious crime (by year)"]
@ -289,8 +293,6 @@ def test_serious_rollup_avg_yr_equals_mean_of_rollup_bars(tmp_path):
2014: pytest.approx(12.0, abs=0.05),
2024: pytest.approx(12.0, abs=0.05),
}
mean_of_bars = sum(serious_bars.values()) / len(serious_bars)
assert avg["Serious crime (avg/yr)"] == pytest.approx(mean_of_bars, abs=0.05)
def test_avg_yr_denominator_is_per_postcode_not_global(tmp_path):