idgf

2026-06-02 20:14:32 +01:00 · 2026-06-02 20:14:32 +01:00 · aab85fe32e
commit aab85fe32e
parent fbfebc651c
33 changed files with 2016 additions and 283 deletions
--- a/pipeline/transform/crime_spatial.py
+++ b/pipeline/transform/crime_spatial.py
@@ -273,6 +273,28 @@ def _write_avg_yr(
    for type_idx, name in enumerate(ALL_CRIME_TYPES):
        data[f"{name} (avg/yr)"] = avg[:, type_idx]

+    # Serious/Minor rollup headlines, computed the SAME way as the by-year rollup
+    # bars (_write_by_year/_rollup_long): sum the rollup's types per year, then
+    # average over the years in which ANY of those types occurred. This keeps the
+    # headline equal to the mean of the "Serious/Minor crime (by year)" bars.
+    # Summing the per-type avg/yr values instead (as the merge previously did)
+    # divides each type by its OWN years-present and overstates the rollup when a
+    # postcode's serious/minor types occur in disjoint years.
+    for rollup_name, rollup_types in (
+        ("Serious crime", SERIOUS_CRIME_TYPES),
+        ("Minor crime", MINOR_CRIME_TYPES),
+    ):
+        rollup_idx = [ALL_CRIME_TYPES.index(name) for name in rollup_types]
+        rollup_counts = counts[:, rollup_idx, :].sum(axis=1)  # (n_postcodes, n_years)
+        rollup_per_year = per_year[:, rollup_idx, :].sum(axis=1)
+        rollup_years_present = np.clip(
+            (rollup_counts > 0).sum(axis=1), 1, None
+        ).astype(np.float64)
+        rollup_avg = rollup_per_year.sum(axis=1) / rollup_years_present
+        data[f"{rollup_name} (avg/yr)"] = np.round(rollup_avg * norm, 1).astype(
+            np.float32
+        )
+
    output_path.parent.mkdir(parents=True, exist_ok=True)
    pl.DataFrame(data).write_parquet(output_path, compression="zstd")
    print(f"Wrote postcode crime averages: {output_path}")