Improve data

This commit is contained in:
Andras Schmelczer 2026-06-10 07:54:25 +01:00
parent b4d66a28c1
commit 85da1941aa
31 changed files with 901 additions and 319 deletions

View file

@@ -273,27 +273,24 @@ def _write_avg_yr(
for type_idx, name in enumerate(ALL_CRIME_TYPES):
data[f"{name} (avg/yr)"] = avg[:, type_idx]
# Serious/Minor rollup headlines, computed the SAME way as the by-year rollup
# bars (_write_by_year/_rollup_long): sum the rollup's types per year, then
# average over the years in which ANY of those types occurred. This keeps the
# headline equal to the mean of the "Serious/Minor crime (by year)" bars.
# Summing the per-type avg/yr values instead (as the merge previously did)
# divides each type by its OWN years-present and overstates the rollup when a
# postcode's serious/minor types occur in disjoint years.
# Serious/Minor rollup headlines = the exact SUM of their component (avg/yr)
# columns, so each rollup always equals the sum of the parts shown beside it
# and can never fall below one of its own components. (Previously the rollup
# re-derived a union-years-present mean: it divided the summed counts by the
# number of years in which ANY component type occurred, whereas each
# component divides by its OWN years-present. When a postcode's serious/minor
# types occurred in disjoint years the union denominator was larger, so the
# rollup came out smaller than the sum of its parts.) The by-year rollup
# series in _write_by_year is likewise the per-year sum of the component
# bars, so headline and chart both present the rollup as the sum of its parts.
for rollup_name, rollup_types in (
("Serious crime", SERIOUS_CRIME_TYPES),
("Minor crime", MINOR_CRIME_TYPES),
):
rollup_idx = [ALL_CRIME_TYPES.index(name) for name in rollup_types]
rollup_counts = counts[:, rollup_idx, :].sum(axis=1) # (n_postcodes, n_years)
rollup_per_year = per_year[:, rollup_idx, :].sum(axis=1)
rollup_years_present = np.clip(
(rollup_counts > 0).sum(axis=1), 1, None
).astype(np.float64)
rollup_avg = rollup_per_year.sum(axis=1) / rollup_years_present
data[f"{rollup_name} (avg/yr)"] = np.round(rollup_avg * norm, 1).astype(
np.float32
)
data[f"{rollup_name} (avg/yr)"] = np.round(
avg[:, rollup_idx].sum(axis=1), 1
).astype(np.float32)
output_path.parent.mkdir(parents=True, exist_ok=True)
pl.DataFrame(data).write_parquet(output_path, compression="zstd")