Improve data

2026-06-10 07:54:25 +01:00 · 2026-06-10 07:54:25 +01:00 · 85da1941aa
commit 85da1941aa
parent b4d66a28c1
31 changed files with 901 additions and 319 deletions
--- a/pipeline/transform/crime_spatial.py
+++ b/pipeline/transform/crime_spatial.py
@@ -273,27 +273,24 @@ def _write_avg_yr(
    for type_idx, name in enumerate(ALL_CRIME_TYPES):
        data[f"{name} (avg/yr)"] = avg[:, type_idx]

-    # Serious/Minor rollup headlines, computed the SAME way as the by-year rollup
-    # bars (_write_by_year/_rollup_long): sum the rollup's types per year, then
-    # average over the years in which ANY of those types occurred. This keeps the
-    # headline equal to the mean of the "Serious/Minor crime (by year)" bars.
-    # Summing the per-type avg/yr values instead (as the merge previously did)
-    # divides each type by its OWN years-present and overstates the rollup when a
-    # postcode's serious/minor types occur in disjoint years.
+    # Serious/Minor rollup headlines = the exact SUM of their component (avg/yr)
+    # columns, so each rollup always equals the sum of the parts shown beside it
+    # and can never fall below one of its own components. (Previously the rollup
+    # re-derived a union-years-present mean: it divided the summed counts by the
+    # number of years in which ANY component type occurred, whereas each
+    # component divides by its OWN years-present. When a postcode's serious/minor
+    # types occurred in disjoint years the union denominator was larger, so the
+    # rollup came out smaller than the sum of its parts.) The by-year rollup
+    # series in _write_by_year is likewise the per-year sum of the component
+    # bars, so headline and chart both present the rollup as the sum of its parts.
    for rollup_name, rollup_types in (
        ("Serious crime", SERIOUS_CRIME_TYPES),
        ("Minor crime", MINOR_CRIME_TYPES),
    ):
        rollup_idx = [ALL_CRIME_TYPES.index(name) for name in rollup_types]
-        rollup_counts = counts[:, rollup_idx, :].sum(axis=1)  # (n_postcodes, n_years)
-        rollup_per_year = per_year[:, rollup_idx, :].sum(axis=1)
-        rollup_years_present = np.clip(
-            (rollup_counts > 0).sum(axis=1), 1, None
-        ).astype(np.float64)
-        rollup_avg = rollup_per_year.sum(axis=1) / rollup_years_present
-        data[f"{rollup_name} (avg/yr)"] = np.round(rollup_avg * norm, 1).astype(
-            np.float32
-        )
+        data[f"{rollup_name} (avg/yr)"] = np.round(
+            avg[:, rollup_idx].sum(axis=1), 1
+        ).astype(np.float32)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    pl.DataFrame(data).write_parquet(output_path, compression="zstd")